Problem Definition

Aim

Setup

Setting Working Directory

# Add user working directory path.
setwd("~/Documents/RMIT/TimeSeriesAnalysis/A2")

# Verify present working directory.
getwd()
## [1] "/Users/chris/Documents/RMIT/TimeSeriesAnalysis/A2"

Dependencies

# Code to install dependencies.
# install.packages(c('tidyverse', 'TSA', 'forecast', 'lmtest', 'tseries', 'urca'))

# Import Dependencies.
library(tidyverse)
library(TSA)
library(forecast)
library(lmtest)
library(urca)
library(tseries)

# Source Utility Script
source("./utilities.R")

Data

# Reading Data into R environment.
sunspot_data <- read.csv('./Resources/YearlySunspotData.csv', sep = ';', header = FALSE)

# Extract the first two columns
sunspot_data <- sunspot_data[, 1:2]

# Assign column names
colnames(sunspot_data) <- c("Year","MeanSunspotNumber")

# Display Sunspot Data
sunspot_data %>% head(15)
##      Year MeanSunspotNumber
## 1  1700.5               8.3
## 2  1701.5              18.3
## 3  1702.5              26.7
## 4  1703.5              38.3
## 5  1704.5              60.0
## 6  1705.5              96.7
## 7  1706.5              48.3
## 8  1707.5              33.3
## 9  1708.5              16.7
## 10 1709.5              13.3
## 11 1710.5               5.0
## 12 1711.5               0.0
## 13 1712.5               0.0
## 14 1713.5               3.3
## 15 1714.5              18.3
# Calculate number of years between 1700 - 2023 (324 years)
true_years = (2023 - 1700) + 1 

# Verify year count with true year count
cat("Number of years between 1700 - 2023 (inclusive): ", true_years, 
    "\nNumber of years accounted for out of 324 years: ", sunspot_data %>% nrow(),
    "\nCount of null values in the data set: ", sum(is.na(sunspot_data$MeanSunspotNumber)))
## Number of years between 1700 - 2023 (inclusive):  324 
## Number of years accounted for out of 324 years:  324 
## Count of null values in the data set:  0

Convert to TS Object

# Convert data to time series object
sunspot_TS <- sunspot_data$MeanSunspotNumber %>% ts(start = 1700,
                                                     end = 2023,
                                                     frequency = 1)
# View first 10 years of the series
sunspot_TS %>% head(10)
## Time Series:
## Start = 1700 
## End = 1709 
## Frequency = 1 
##  [1]  8.3 18.3 26.7 38.3 60.0 96.7 48.3 33.3 16.7 13.3

EDA

Exploration

# Verify Data type 
sunspot_TS %>% class()
## [1] "ts"
# Summarize the time series object
sunspot_TS %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   24.57   65.55   78.53  115.47  269.30
# Create box plot to visualize sunspot number
sunspot_TS %>% boxplot(horizontal = TRUE, 
                       main = "Figure 1: Yearly Mean Sunspot Number (1700-2023)",
                       xlab = "Mean Sunspot Number", 
                       border = "black")
# Add grid
grid(nx = NULL, ny = NULL, col = "gray", lty = "dotted")

Visualizing data

# Plot sunspot series across time
sunspot_TS %>% plot(type = 'o',
                    main = "Figure 2: Yearly Mean Sunspot Number (1700-2023)", 
                    xlab = "Year", 
                    ylab = "Mean Sunspot Number")

# Add grid
grid(nx = NULL, ny = NULL, col = "gray", lty = "dotted")

ACF and PACF

# Set up a 1x2 layout for plots
par(mfrow=c(1,2))

# Plot ACF plot.
sunspot_TS %>% acf(lag.max = 70,
                   main = "Figure 3.1: Autocorrelation Function (ACF) of Temperature Anomalies",
                   xlab = "Lag", 
                   ylab = "ACF")

# Plot PACF plot.
sunspot_TS %>% pacf(lag.max = 70,
                    main = "Figure 3.2: Partial Autocorrelation Function (PACF) of Temperature Anomalies",
                    xlab = "Lag", 
                    ylab = "PACF")

# Augmented Dickey-Fuller Test for stationarity
sunspot_TS %>% adf.test()
## Warning in adf.test(.): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  .
## Dickey-Fuller = -5.3319, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
# Phillips-Perron Unit Root Test for stationarity
sunspot_TS %>% pp.test()
## Warning in pp.test(.): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  .
## Dickey-Fuller Z(alpha) = -83.347, Truncation lag parameter = 5, p-value
## = 0.01
## alternative hypothesis: stationary
# Plotting a QQ plot for anomaly_TS
sunspot_TS %>% qqnorm(main = "Figure 4: Normal QQ Plot of Anomaly Series")
sunspot_TS %>% qqline(col='blue', lty=2)

# Perform Shapiro-Wilks test for normality. 
rawTS_shapiro <- sunspot_TS %>% shapiro.test()
rawTS_shapiro
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.92106, p-value = 4.853e-12

Box-Cox Transformation

# Adding a small positive value to ensure non-zero data
positive_sunspot_TS <- sunspot_TS + (abs(min(sunspot_TS)) + 0.01)

# Applying Box-Cox transformation
BC <- positive_sunspot_TS %>% BoxCox.ar(lambda = seq(-2, 2, 0.01), method = 'yule-walker')

# Add grid
grid(nx = NULL, ny = NULL, col = "gray", lty = "dotted")

# Add main title separately
title(main = 'Figure 5: Optimal Lambda value', adj = 0.5, line = 1.4)

# Use Log-likelihood estimation to find optimal lambda values
lambda <- BC$lambda[which(max(BC$loglike) == BC$loglike)]

# Print results
cat("Lower bound Confidence Interval: ", BC$ci[1],
    "\nUpper bound Confidence Interval: ", BC$ci[2],
    "\n\nOptimal Lambda value: ", lambda)
## Lower bound Confidence Interval:  0.44 
## Upper bound Confidence Interval:  0.56 
## 
## Optimal Lambda value:  0.49
# Set up a 1x2 layout for plots
par(mfrow=c(2,2))

# Plot original anomaly series across time
sunspot_TS %>% plot(type = 'o',
                    main = "Figure 6.1: Original Land Temperature Anomalies (1850-2023)", 
                    xlab = "Year", 
                    ylab = "Temperature Anomalies (°C)")

# Add grid
grid(nx = NULL, ny = NULL, col = "gray", lty = "dotted")

# Plot Box-Cox Transformed anomaly series across time
BC_sunspot_TS <- ((positive_sunspot_TS^lambda) - 1) / lambda

BC_sunspot_TS %>% plot(type = 'o',
                       main = 'Figure 6.2: Box-Cox Transformed Anomaly series (1850-2023)',
                       xlab = 'Year',
                       ylab = 'Transformed Temperature anomalies (°C)')

# Add grid
grid(nx = NULL, ny = NULL, col = "gray", lty = "dotted")

# Test for normality with Box-Cox transformed series.
BC_TS_Stest <- BC_sunspot_TS %>% shapiro.test()

# Plotting a QQ plot for anomaly_TS
sunspot_TS %>% qqnorm(main = "Figure 6.3: Normal QQ Plot of Original Anomaly Series")
sunspot_TS %>% qqline(col='blue', lty=2) %>% 
text(x = 1, y = -0.4, 
     labels = paste0("Shapiro-Wilks (p-value): ", rawTS_shapiro[2]),
     col='blue', cex=1.5)
grid(nx = NULL, ny = NULL, col = "gray", lty = "dotted")

# QQ-Plot Box-Cox transformed series
BC_sunspot_TS %>% qqnorm(main = "Figure 6.4: Normal QQ Plot of Box-Cox Anomaly Series")
BC_sunspot_TS %>% qqline(lty=2, col='blue') %>% 
text(x = 1.2, y = -1.255, 
     labels = paste0("Shapiro-Wilks (p-value): ", BC_TS_Stest[2]),
     col='blue', cex=1.5)
grid(nx = NULL, ny = NULL, col = "gray", lty = "dotted")

# Set up a 1x2 layout for plots
par(mfrow=c(1,2))

# Plot ACF of differenced series
BC_sunspot_TS %>% acf(main = "Figure 8.1: Autocorrelation Function (ACF) of differenced Anomaly series",
                        xlab = "Lag", 
                        ylab = "ACF", lag.max = 70)
grid()

# Plot ACF of differenced series
BC_sunspot_TS %>% pacf(main = "Figure 8.2: Partial Autocorrelation Function (PACF) of differenced Anomaly series",
                         xlab = "Lag", 
                         ylab = "PACF", lag.max = 70)
grid()

model <- Arima(sunspot_TS, order = c(3,0,0), seasonal = list(order=c(2,1,2), period=1))

res_model <- rstandard(model)

plot(res_model, type='o')

par(mfrow=c(1,2))
res_model %>% acf(lag.max = 100)
res_model %>% pacf(lag.max = 100)

model <- auto.arima(sunspot_TS)
r <- forecast(model)

r %>% plot()

model %>% summary()
## Series: sunspot_TS 
## ARIMA(2,0,1) with non-zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1     mean
##       1.4808  -0.7720  -0.1858  78.8401
## s.e.  0.0475   0.0428   0.0705   3.9410
## 
## sigma^2 = 650.2:  log likelihood = -1508.28
## AIC=3026.57   AICc=3026.76   BIC=3045.47
## 
## Training set error measures:
##                        ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.0006686022 25.34141 19.33997 -Inf  Inf 0.6712814 -0.00633822